In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import display
In [2]:
#import the data
Airbnb = pd.read_csv("../Project/archive/AB_NYC_2019.csv")
In [3]:
Airbnb.shape
Out[3]:
(48895, 16)
In [4]:
Airbnb.head()
Out[4]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
In [5]:
Airbnb.isnull().sum()
Out[5]:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64
In [6]:
Airbnb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     38843 non-null  object 
 13  reviews_per_month               38843 non-null  float64
 14  calculated_host_listings_count  48895 non-null  int64  
 15  availability_365                48895 non-null  int64  
dtypes: float64(3), int64(7), object(6)
memory usage: 6.0+ MB
In [7]:
Airbnb[["price","neighbourhood_group","neighbourhood","room_type","minimum_nights","number_of_reviews","reviews_per_month","last_review","calculated_host_listings_count","availability_365"]].describe(include=["object","datetime64","float","int","int64"])
#noticed that there are 0 values in the price
Out[7]:
price neighbourhood_group neighbourhood room_type minimum_nights number_of_reviews reviews_per_month last_review calculated_host_listings_count availability_365
count 48895.000000 48895 48895 48895 48895.000000 48895.000000 38843.000000 38843 48895.000000 48895.000000
unique NaN 5 221 3 NaN NaN NaN 1764 NaN NaN
top NaN Manhattan Williamsburg Entire home/apt NaN NaN NaN 2019-06-23 NaN NaN
freq NaN 21661 3920 25409 NaN NaN NaN 1413 NaN NaN
mean 152.720687 NaN NaN NaN 7.029962 23.274466 1.373221 NaN 7.143982 112.781327
std 240.154170 NaN NaN NaN 20.510550 44.550582 1.680442 NaN 32.952519 131.622289
min 0.000000 NaN NaN NaN 1.000000 0.000000 0.010000 NaN 1.000000 0.000000
25% 69.000000 NaN NaN NaN 1.000000 1.000000 0.190000 NaN 1.000000 0.000000
50% 106.000000 NaN NaN NaN 3.000000 5.000000 0.720000 NaN 1.000000 45.000000
75% 175.000000 NaN NaN NaN 5.000000 24.000000 2.020000 NaN 2.000000 227.000000
max 10000.000000 NaN NaN NaN 1250.000000 629.000000 58.500000 NaN 327.000000 365.000000
In [8]:
#get the data ( count and view) for zero values
Airbnb[Airbnb["price"] != 0].nunique()
Out[8]:
id                                48884
name                              47894
host_id                           37455
host_name                         11450
neighbourhood_group                   5
neighbourhood                       221
latitude                          19046
longitude                         14715
room_type                             3
price                               673
minimum_nights                      109
number_of_reviews                   394
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64
In [9]:
#drop name and host_name column
Airbnb_clean = Airbnb.drop(["name","host_name"],axis=1).copy()
#fill the missing data in reviews_per_month column
Airbnb_clean.fillna({'reviews_per_month':0}, inplace=True)
#changing the data typ for last_review
Airbnb_clean["last_review"] = pd.to_datetime(Airbnb_clean["last_review"])
In [10]:
Airbnb_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              48895 non-null  int64         
 1   host_id                         48895 non-null  int64         
 2   neighbourhood_group             48895 non-null  object        
 3   neighbourhood                   48895 non-null  object        
 4   latitude                        48895 non-null  float64       
 5   longitude                       48895 non-null  float64       
 6   room_type                       48895 non-null  object        
 7   price                           48895 non-null  int64         
 8   minimum_nights                  48895 non-null  int64         
 9   number_of_reviews               48895 non-null  int64         
 10  last_review                     38843 non-null  datetime64[ns]
 11  reviews_per_month               48895 non-null  float64       
 12  calculated_host_listings_count  48895 non-null  int64         
 13  availability_365                48895 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(7), object(3)
memory usage: 5.2+ MB
In [11]:
Airbnb_clean.head()
Out[11]:
id host_id neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 2787 Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 2845 Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 4632 Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaT 0.00 1 365
3 3831 4869 Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 7192 Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
In [12]:
Airbnb_clean[["price","neighbourhood_group","latitude","longitude","neighbourhood","room_type","minimum_nights","number_of_reviews","reviews_per_month","last_review","calculated_host_listings_count","availability_365"]].describe(include=["object","datetime64","float","int","int64"])
Out[12]:
price neighbourhood_group latitude longitude neighbourhood room_type minimum_nights number_of_reviews reviews_per_month last_review calculated_host_listings_count availability_365
count 48895.000000 48895 48895.000000 48895.000000 48895 48895 48895.000000 48895.000000 48895.000000 38843 48895.000000 48895.000000
unique NaN 5 NaN NaN 221 3 NaN NaN NaN NaN NaN NaN
top NaN Manhattan NaN NaN Williamsburg Entire home/apt NaN NaN NaN NaN NaN NaN
freq NaN 21661 NaN NaN 3920 25409 NaN NaN NaN NaN NaN NaN
mean 152.720687 NaN 40.728949 -73.952170 NaN NaN 7.029962 23.274466 1.090910 2018-10-04 01:47:23.910099456 7.143982 112.781327
min 0.000000 NaN 40.499790 -74.244420 NaN NaN 1.000000 0.000000 0.000000 2011-03-28 00:00:00 1.000000 0.000000
25% 69.000000 NaN 40.690100 -73.983070 NaN NaN 1.000000 1.000000 0.040000 2018-07-08 00:00:00 1.000000 0.000000
50% 106.000000 NaN 40.723070 -73.955680 NaN NaN 3.000000 5.000000 0.370000 2019-05-19 00:00:00 1.000000 45.000000
75% 175.000000 NaN 40.763115 -73.936275 NaN NaN 5.000000 24.000000 1.580000 2019-06-23 00:00:00 2.000000 227.000000
max 10000.000000 NaN 40.913060 -73.712990 NaN NaN 1250.000000 629.000000 58.500000 2019-07-08 00:00:00 327.000000 365.000000
std 240.154170 NaN 0.054530 0.046157 NaN NaN 20.510550 44.550582 1.597283 NaN 32.952519 131.622289
In [13]:
Airbnb_clean[Airbnb["price"]==0]
Out[13]:
id host_id neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
23161 18750597 8993084 Brooklyn Bedford-Stuyvesant 40.69023 -73.95428 Private room 0 4 1 2018-01-06 0.05 4 28
25433 20333471 131697576 Bronx East Morrisania 40.83296 -73.88668 Private room 0 2 55 2019-06-24 2.56 4 127
25634 20523843 15787004 Brooklyn Bushwick 40.69467 -73.92433 Private room 0 2 16 2019-05-18 0.71 5 0
25753 20608117 1641537 Brooklyn Greenpoint 40.72462 -73.94072 Private room 0 2 12 2017-10-27 0.53 2 0
25778 20624541 10132166 Brooklyn Williamsburg 40.70838 -73.94645 Entire home/apt 0 5 3 2018-01-02 0.15 1 73
25794 20639628 86327101 Brooklyn Bedford-Stuyvesant 40.68173 -73.91342 Private room 0 1 93 2019-06-15 4.28 6 176
25795 20639792 86327101 Brooklyn Bedford-Stuyvesant 40.68279 -73.91170 Private room 0 1 95 2019-06-21 4.37 6 232
25796 20639914 86327101 Brooklyn Bedford-Stuyvesant 40.68258 -73.91284 Private room 0 1 95 2019-06-23 4.35 6 222
26259 20933849 13709292 Manhattan Murray Hill 40.75091 -73.97597 Entire home/apt 0 3 0 NaT 0.00 1 0
26841 21291569 101970559 Brooklyn Bushwick 40.69211 -73.90670 Shared room 0 30 2 2019-06-22 0.11 6 333
26866 21304320 101970559 Brooklyn Bushwick 40.69166 -73.90928 Shared room 0 30 5 2019-05-24 0.26 6 139
In [14]:
Airbnb_clean[Airbnb_clean["price"]==0]["price"].value_counts()
Out[14]:
price
0    11
Name: count, dtype: int64
In [15]:
# have all numeric data pairs scatter plot to se the correlation
sns.set()
columns=["price","neighbourhood_group","latitude","longitude","neighbourhood","room_type","minimum_nights","number_of_reviews","reviews_per_month","last_review","calculated_host_listings_count","availability_365"]
facet_scatter = sns.pairplot(Airbnb_clean[columns], height=2, kind='scatter', diag_kind='kde')
plt.xticks(rotation=45, horizontalalignment='right')
plt.show()
In [16]:
# plot the correlation between numeric data
corr_cols = ["price","latitude","longitude","minimum_nights","number_of_reviews","reviews_per_month","last_review","calculated_host_listings_count","availability_365"]
correlation = Airbnb_clean[corr_cols].corr()
correlation.style.background_gradient(axis=None)
Out[16]:
  price latitude longitude minimum_nights number_of_reviews reviews_per_month last_review calculated_host_listings_count availability_365
price 1.000000 0.033939 -0.150019 0.042799 -0.047954 -0.050564 -0.085239 0.057472 0.081829
latitude 0.033939 1.000000 0.084788 0.024869 -0.015389 -0.018758 -0.029530 0.019517 -0.010983
longitude -0.150019 0.084788 1.000000 -0.062747 0.059094 0.138516 0.044203 -0.114713 0.082731
minimum_nights 0.042799 0.024869 -0.062747 1.000000 -0.080116 -0.124905 -0.111649 0.127960 0.144303
number_of_reviews -0.047954 -0.015389 0.059094 -0.080116 1.000000 0.589407 0.267759 -0.072376 0.172028
reviews_per_month -0.050564 -0.018758 0.138516 -0.124905 0.589407 1.000000 0.350466 -0.047312 0.163732
last_review -0.085239 -0.029530 0.044203 -0.111649 0.267759 0.350466 1.000000 -0.117821 0.033440
calculated_host_listings_count 0.057472 0.019517 -0.114713 0.127960 -0.072376 -0.047312 -0.117821 1.000000 0.225701
availability_365 0.081829 -0.010983 0.082731 0.144303 0.172028 0.163732 0.033440 0.225701 1.000000
In [17]:
top_host=Airbnb_clean.host_id.value_counts().head(10)
Airbnb_clean.host_id.value_counts().head(10)
Out[17]:
host_id
219517861    327
107434423    232
30283594     121
137358866    103
16098958      96
12243051      96
61391963      91
22541573      87
200380610     65
7503643       52
Name: count, dtype: int64
In [18]:
top_host.plot(
    kind="bar",
    title="top listing ",
    ylabel="count",
    xlabel="host ID"
);
In [19]:
# count how many type of rooms available
Airbnb_clean["room_type"].unique()
Airbnb_clean["room_type"].value_counts()
Out[19]:
room_type
Entire home/apt    25409
Private room       22326
Shared room         1160
Name: count, dtype: int64
In [20]:
Airbnb_clean["room_type"].value_counts().plot(
    kind="bar",
    title="type of room count",
    ylabel="count",
    xlabel="room type"
);
In [21]:
#box plot for the price vs the type of room to see the the outliers
px.box(Airbnb_clean,x="room_type",y="price",title="price vs type of room");
In [22]:
Airbnb['price'].quantile([0.05, 0.25, 0.5, 0.75, 0.95])
Out[22]:
0.05     40.0
0.25     69.0
0.50    106.0
0.75    175.0
0.95    355.0
Name: price, dtype: float64
In [23]:
Airbnb_clean_hist=px.histogram(Airbnb_clean,x="price",facet_col="room_type",)
Airbnb_clean_hist.show()
In [24]:
px.box(Airbnb_clean,x="room_type",y="price")
In [25]:
#removing all the values above 400 to remove the outliers , from the above , we see that the highest wesiker is 392, according to Bruce, Bruce and Gedeck (2020) , what ever above that is outlier
Airbnb_clean_outliers = Airbnb_clean.query("price < 355 and price > 40").copy()
In [26]:
Airbnb_clean_outliers_hist=px.histogram(Airbnb_clean_outliers,x="price",facet_col="room_type",nbins=30)
Airbnb_clean_outliers_hist.show()
In [27]:
px.box(Airbnb_clean_outliers,x="room_type",y="price")
In [28]:
Airbnb_clean_outliers_hist= px.histogram(Airbnb_clean_outliers,x="price",nbins=50)
Airbnb_clean_outliers_hist.show()
In [29]:
#get thee number of listing in each neighbourhood_group
neighbourhood_group_bar=Airbnb_clean["neighbourhood_group"].value_counts()
Airbnb_clean["neighbourhood_group"].value_counts()
Out[29]:
neighbourhood_group
Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: count, dtype: int64
In [30]:
neighbourhood_group_bar.plot(
    kind="bar",
    title="amount of listing in each neighbourhood group",
    ylabel="count",
    xlabel="neighbourhood_group"

);
In [32]:
#get thee number of listing in each neighbourhood
neighbourhood_vis=Airbnb_clean["neighbourhood"].value_counts().head(30)
neighbourhood_vis_df =pd.DataFrame(neighbourhood_vis)
neighbourhood_vis_df.reset_index(inplace=True)
# neighbourhood_vis_df.rename(columns={'index':'neighbourhood', 'neighbourhood':'Count'}, inplace=True)
neighbourhood_vis_df

# Airbnb_clean["neighbourhood"].value_counts().head(30)
Out[32]:
neighbourhood count
0 Williamsburg 3920
1 Bedford-Stuyvesant 3714
2 Harlem 2658
3 Bushwick 2465
4 Upper West Side 1971
5 Hell's Kitchen 1958
6 East Village 1853
7 Upper East Side 1798
8 Crown Heights 1564
9 Midtown 1545
10 East Harlem 1117
11 Greenpoint 1115
12 Chelsea 1113
13 Lower East Side 911
14 Astoria 900
15 Washington Heights 899
16 West Village 768
17 Financial District 744
18 Flatbush 621
19 Clinton Hill 572
20 Long Island City 537
21 Prospect-Lefferts Gardens 535
22 Park Slope 506
23 East Flatbush 500
24 Fort Greene 489
25 Murray Hill 485
26 Kips Bay 470
27 Flushing 426
28 Ridgewood 423
29 Greenwich Village 392
In [35]:
plt.figure(figsize=(10, 6))  # Adjust the numbers to change the figure size
neighbourhood_vis = sns.barplot(x="neighbourhood", y="Count", data=neighbourhood_vis_df, palette='Blues_d')
neighbourhood_vis.set_title('count of listing in each neighbourhood')
neighbourhood_vis.set_ylabel('Count of listings')
neighbourhood_vis.set_xlabel('neighbourhood')
neighbourhood_vis.set_xticklabels(neighbourhood_vis.get_xticklabels(), rotation=90);
In [34]:
# create Dataframe for the number of listings
nei_group = Airbnb_clean.groupby("neighbourhood_group")["neighbourhood"].value_counts().reset_index(name="count").copy()
nei_group_outliers = Airbnb_clean_outliers.groupby("neighbourhood_group")["neighbourhood"].value_counts().reset_index(name="count").copy()

nei_group
# px.bar(nei_group)
Out[34]:
neighbourhood_group neighbourhood count
0 Bronx Kingsbridge 70
1 Bronx Fordham 63
2 Bronx Longwood 62
3 Bronx Mott Haven 60
4 Bronx Wakefield 50
... ... ... ...
216 Staten Island Richmondtown 1
217 Staten Island New Dorp 1
218 Staten Island Fort Wadsworth 1
219 Staten Island Willowbrook 1
220 Staten Island Woodrow 1

221 rows × 3 columns

In [35]:
Airbnb_clean_listing = px.bar(nei_group, x="neighbourhood",y="count",color="neighbourhood_group",labels={'count': 'count of listings'})
Airbnb_clean_listing.show()
In [36]:
Airbnb_clean_outliers_listing= px.bar(nei_group_outliers, x="neighbourhood",y="count",color="neighbourhood_group",labels={'count': 'count of listings'})
Airbnb_clean_outliers_listing.show()
In [37]:
#scatter plot to know which neighbourhood has the most expensive listing
Airbnb_clean_facet = px.scatter(Airbnb_clean, x = "neighbourhood", y = "price", color="neighbourhood_group" , facet_col="room_type" )
Airbnb_clean_facet.show()
In [38]:
#scatter plot to know which neighbourhood has the most expensive listing aften removing the outliers
Airbnb_clean_outliers_facet = px.scatter(Airbnb_clean_outliers, x = "neighbourhood", y = "price", color="neighbourhood_group" )
Airbnb_clean_outliers_facet.show()
In [39]:
#see geografically where is the listings
scatter_mapbox_clean = px.scatter_mapbox(
    Airbnb_clean_outliers,
    lat="latitude",
    lon="longitude",
    color="price",
    mapbox_style="open-street-map",
    hover_data="price",
    hover_name="room_type",
    height=800,
    title="distribution of listings as per the price",



)
scatter_mapbox_clean.show()
In [40]:
scatter_mapbox_clean = px.scatter_mapbox(
    Airbnb_clean_outliers,
    lat="latitude",
    lon="longitude",
    color="neighbourhood_group",
    mapbox_style="open-street-map",
    hover_data="price",
    hover_name="room_type",
    height=800,
    title="distribution of listings for each neighbourhood_group",



)
scatter_mapbox_clean.show()
In [43]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns

# Load the data
Airbnb = pd.read_csv("/content/AB_NYC_2019.csv")

# Extract relevant features for clustering
features = ["room_type", "latitude", "longitude", "price", "number_of_reviews", "calculated_host_listings_count"]
X_airbnb = Airbnb[features].values

# Define column indices for numerical and categorical features
numerical_cols = [1, 2, 3, 4, 5]
categorical_cols = [0]

# Create transformers
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

# Apply the preprocessor to the data and remove rows with missing values
X_airbnb_encoded = preprocessor.fit_transform(pd.DataFrame(X_airbnb))
X_airbnb_encoded = X_airbnb_encoded[~np.isnan(X_airbnb_encoded).any(axis=1)]

# Perform K-means clustering with the optimal K
optimal_k = 7
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(X_airbnb_encoded)

# Convert the data and cluster labels to a DataFrame for Seaborn
feature_names = ["Room Type", "Latitude", "Longitude", "Price", "Number of Reviews", "Host Listings Count"]
data_with_clusters = pd.DataFrame(X_airbnb, columns=feature_names)
data_with_clusters['Cluster'] = clusters

# Add the 'neighbourhood_group' column to data_with_clusters
data_with_clusters['neighbourhood_group'] = Airbnb['neighbourhood_group']

# Labeling categorical dummy variables for descriptive statistics
data_with_clusters["Room Type"] = X_airbnb[:, 0]
data_with_clusters.head()
#cluster_stats_numeric = data_with_clusters.groupby('Cluster').apply(lambda x: x.describe()).transpose()
cluster_stats = data_with_clusters.groupby('Cluster').agg(['min', 'max', 'mean', 'std']).transpose()

numerical_columns = ['Latitude', 'Longitude', 'Price', 'Number of Reviews', 'Host Listings Count']
data_with_clusters[numerical_columns] = data_with_clusters[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Percentage of each neighbourhood_group for each cluster
room_type_percentage = data_with_clusters.groupby(['Cluster', 'Room Type']).size().unstack(fill_value=0).div(data_with_clusters['Cluster'].value_counts(), axis=0) * 100
neighbourhood_percentage = data_with_clusters.groupby(['Cluster', 'neighbourhood_group']).size().unstack(fill_value=0).div(data_with_clusters['Cluster'].value_counts(), axis=0) * 100
pd.set_option('display.max_columns', None)

print(cluster_stats)
print(room_type_percentage.T)
print(neighbourhood_percentage.T)

# Create five violin plots with the same category variable
fig, axes = plt.subplots(5, 1, figsize=(5, 20), sharey=False)

# Plot each numerical variable separately
for i, value_col in enumerate(['Latitude', 'Longitude', 'Price', 'Number of Reviews', 'Host Listings Count']):
    sns.violinplot(x='Cluster', y=value_col, data=data_with_clusters, ax=axes[i])
    axes[i].set_title(f'Violin Plot for {value_col}')

# Show the plots
plt.show()

# Visualize clusters using pairplot with different colors for each cluster
sns.pairplot(data_with_clusters, hue='Cluster', palette='viridis', vars=feature_names[1:])
plt.show()

# PCA Visualization with different colors for each cluster
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(X_airbnb_encoded)
data_with_clusters_pca = pd.DataFrame(np.column_stack([pca_result, clusters]), columns=['PCA Component 1', 'PCA Component 2', 'Cluster'])
sns.scatterplot(data=data_with_clusters_pca, x='PCA Component 1', y='PCA Component 2', hue='Cluster', palette='viridis', s=50, alpha=0.8)
plt.title('PCA Visualization of KMeans Clusters')
plt.show()

# K-means clusters based on latitude and longitude, colored by cluster using original dataset
plt.scatter(Airbnb['latitude'], Airbnb['longitude'], c=clusters, cmap='viridis', s=50, alpha=0.8)
plt.title('K-means Clusters Based on Latitude and Longitude')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

<ipython-input-43-ab1566537851>:55: FutureWarning:

['Room Type', 'neighbourhood_group'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.

Cluster                             0             1            2            3  \
Latitude            min     40.506410     40.583630    40.735500    40.538710   
                    max     40.822630     40.825110    40.913060    40.898110   
                    mean    40.713820     40.733868    40.801172    40.723546   
                    std      0.037672      0.045229     0.032190     0.050113   
Longitude           min    -74.244420    -74.097300   -73.999940   -74.169660   
                    max    -73.837460    -73.770690   -73.829320   -73.746270   
                    mean   -73.973330    -73.977600   -73.942120   -73.953578   
                    std      0.028495      0.039361     0.026886     0.039106   
Price               min      0.000000   2545.000000     0.000000     0.000000   
                    max   2500.000000  10000.000000  2500.000000  1050.000000   
                    mean   224.457766   4820.015152   115.564625   128.122311   
                    std    198.782479   2237.169651    83.734338    83.091550   
Number of Reviews   min      0.000000      0.000000     0.000000    81.000000   
                    max    109.000000     69.000000   143.000000   629.000000   
                    mean    12.727838      2.651515    15.054296   157.637984   
                    std     19.116081      8.882987    22.210175    66.333955   
Host Listings Count min      1.000000      1.000000     1.000000     1.000000   
                    max    121.000000     12.000000   103.000000    28.000000   
                    mean     5.382564      2.772727     2.976965     1.980332   
                    std     17.264435      3.724329     8.968432     1.896065   

Cluster                            4           5            6  
Latitude            min    40.703720   40.499790    40.565460  
                    max    40.790940   40.763220    40.866460  
                    mean   40.729629   40.695390    40.710963  
                    std     0.023925    0.034129     0.050271  
Longitude           min   -74.017120  -74.242850   -73.900020  
                    max   -73.949100  -73.884260   -73.712990  
                    mean  -73.996995  -73.956629   -73.826358  
                    std     0.014972    0.034042     0.044847  
Price               min   100.000000    0.000000    10.000000  
                    max   699.000000  800.000000  1500.000000  
                    mean  273.928444   79.142267    95.403955  
                    std   100.975965   47.371032    90.731393  
Number of Reviews   min     0.000000    0.000000     0.000000  
                    max    20.000000  100.000000   212.000000  
                    mean    2.343470   12.143374    24.406780  
                    std     3.775523   18.743582    30.796190  
Host Listings Count min   232.000000    1.000000     1.000000  
                    max   327.000000  103.000000   103.000000  
                    mean  287.572451    3.256109     3.526483  
                    std    46.850957    8.457560     8.337037  
                         0          1          2          3          4  \
Room Type                                                                
Entire home/apt  98.463190  81.818182  41.261426  53.657037  98.568873   
Private room      0.813605  18.181818  54.890311  44.714198   1.431127   
Shared room       0.723205   0.000000   3.848263   1.628765   0.000000   

                         5          6  
Room Type                              
Entire home/apt   0.465116  37.217514  
Private room     96.293835  58.580508  
Shared room       3.241048   4.201977  
                             0          1          2          3          4  \
neighbourhood_group                                                          
Bronx                 0.000000   0.000000   8.930530   1.167793   0.000000   
Brooklyn             47.771061  22.727273   0.000000  47.910264   0.357782   
Manhattan            49.104469  72.727273  75.740402  39.551321  99.642218   
Queens                2.180914   3.030303  15.329068  10.540873   0.000000   
Staten Island         0.943556   1.515152   0.000000   0.829748   0.000000   

                             5          6  
neighbourhood_group                        
Bronx                 0.000000   2.683616  
Brooklyn             72.595054   8.474576  
Manhattan            20.612772   0.000000  
Queens                5.478036  88.841808  
Staten Island         1.314138   0.000000  
In [44]:
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X_airbnb_encoded) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, n_init="auto", random_state=10)
    cluster_labels = clusterer.fit_predict(X_airbnb_encoded)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X_airbnb_encoded, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X_airbnb_encoded, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X_airbnb_encoded[:, 0], X_airbnb_encoded[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )

plt.show()
For n_clusters = 2 The average silhouette_score is : 0.1909706201676494
For n_clusters = 3 The average silhouette_score is : 0.19924914334117888
For n_clusters = 4 The average silhouette_score is : 0.21436777840705593
For n_clusters = 5 The average silhouette_score is : 0.2164208271656542
For n_clusters = 6 The average silhouette_score is : 0.24978012466829047
For n_clusters = 7 The average silhouette_score is : 0.28402181291564366
For n_clusters = 8 The average silhouette_score is : 0.28298092537662833
In [45]:
# Determine the range of K values to explore
k_values = range(1, 11)

# Calculate inertias for different K values
inertias = [KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_airbnb_encoded).inertia_ for k in k_values]

# Plot the elbow curve
plt.plot(k_values, inertias, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.show()

Repeat clustering with removal of 0 price rooms

In [47]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns

# Load the data
Airbnb = pd.read_csv("/content/AB_NYC_2019.csv")

# Remove rows with 0 value that do not make sense
Airbnb = Airbnb[Airbnb['price'] != 0]

# Extract relevant features for clustering
features = ["room_type", "latitude", "longitude", "price", "number_of_reviews", "calculated_host_listings_count"]
X_airbnb = Airbnb[features].values

# Define column indices for numerical and categorical features
numerical_cols = [1, 2, 3, 4, 5]
categorical_cols = [0]

# Create transformers
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

# Apply the preprocessor to the data and remove rows with missing values
X_airbnb_encoded = preprocessor.fit_transform(pd.DataFrame(X_airbnb))
X_airbnb_encoded = X_airbnb_encoded[~np.isnan(X_airbnb_encoded).any(axis=1)]

# Perform K-means clustering with the optimal K
optimal_k = 7
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(X_airbnb_encoded)

# Convert the data and cluster labels to a DataFrame for Seaborn
feature_names = ["Room Type", "Latitude", "Longitude", "Price", "Number of Reviews", "Host Listings Count"]
data_with_clusters = pd.DataFrame(X_airbnb, columns=feature_names)
data_with_clusters['Cluster'] = clusters

# Add the 'neighbourhood_group' column to data_with_clusters
data_with_clusters['neighbourhood_group'] = Airbnb['neighbourhood_group']

# Labeling categorical dummy variables for descriptive statistics
data_with_clusters["Room Type"] = X_airbnb[:, 0]
data_with_clusters.head()
#cluster_stats_numeric = data_with_clusters.groupby('Cluster').apply(lambda x: x.describe()).transpose()
cluster_stats = data_with_clusters.groupby('Cluster').agg(['min', 'max', 'mean', 'std']).transpose()

numerical_columns = ['Latitude', 'Longitude', 'Price', 'Number of Reviews', 'Host Listings Count']
data_with_clusters[numerical_columns] = data_with_clusters[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Percentage of each neighbourhood_group for each cluster
room_type_percentage = data_with_clusters.groupby(['Cluster', 'Room Type']).size().unstack(fill_value=0).div(data_with_clusters['Cluster'].value_counts(), axis=0) * 100
neighbourhood_percentage = data_with_clusters.groupby(['Cluster', 'neighbourhood_group']).size().unstack(fill_value=0).div(data_with_clusters['Cluster'].value_counts(), axis=0) * 100
pd.set_option('display.max_columns', None)

print(cluster_stats)
print(room_type_percentage.T)
print(neighbourhood_percentage.T)

# Create five violin plots with the same category variable
fig, axes = plt.subplots(5, 1, figsize=(5, 20), sharey=False)

# Plot each numerical variable separately
for i, value_col in enumerate(['Latitude', 'Longitude', 'Price', 'Number of Reviews', 'Host Listings Count']):
    sns.violinplot(x='Cluster', y=value_col, data=data_with_clusters, ax=axes[i])
    axes[i].set_title(f'Violin Plot for {value_col}')

# Show the plots
plt.show()

# Visualize clusters using pairplot with different colors for each cluster
sns.pairplot(data_with_clusters, hue='Cluster', palette='viridis', vars=feature_names[1:])
plt.show()

# PCA Visualization with different colors for each cluster
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(X_airbnb_encoded)
data_with_clusters_pca = pd.DataFrame(np.column_stack([pca_result, clusters]), columns=['PCA Component 1', 'PCA Component 2', 'Cluster'])
sns.scatterplot(data=data_with_clusters_pca, x='PCA Component 1', y='PCA Component 2', hue='Cluster', palette='viridis', s=50, alpha=0.8)
plt.title('PCA Visualization of KMeans Clusters')
plt.show()

# K-means clusters based on latitude and longitude, colored by cluster using original dataset
plt.scatter(Airbnb['latitude'], Airbnb['longitude'], c=clusters, cmap='viridis', s=50, alpha=0.8)
plt.title('K-means Clusters Based on Latitude and Longitude')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning:

The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

<ipython-input-47-fdb1565cae29>:58: FutureWarning:

['Room Type', 'neighbourhood_group'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.

Cluster                             0            1           2           3  \
Latitude            min     40.735500    40.506410   40.703720   40.499790   
                    max     40.913060    40.806460   40.790940   40.762150   
                    mean    40.801094    40.713863   40.729629   40.695217   
                    std      0.032203     0.037627    0.023925    0.034103   
Longitude           min    -73.999940   -74.244420  -74.017120  -74.242850   
                    max    -73.829320   -73.892100  -73.949100  -73.884260   
                    mean   -73.942200   -73.973348  -73.996995  -73.956593   
                    std      0.026907     0.028470    0.014972    0.034047   
Price               min     10.000000    10.000000  100.000000   10.000000   
                    max   1200.000000  2300.000000  699.000000  800.000000   
                    mean   115.393190   222.900616  273.928444   79.125092   
                    std     80.615230   189.914967  100.975965   47.500997   
Number of Reviews   min      0.000000     0.000000    0.000000    0.000000   
                    max    143.000000   109.000000   20.000000  100.000000   
                    mean    15.047928    12.729889    2.343470   12.164225   
                    std     22.204045    19.117426    3.775523   18.773961   
Host Listings Count min      1.000000     1.000000  232.000000    1.000000   
                    max    103.000000   121.000000  327.000000  103.000000   
                    mean     2.959102     5.396461  287.572451    3.255215   
                    std      8.898915    17.289021   46.850957    8.462379   

Cluster                             4            5             6  
Latitude            min     40.538710    40.565460     40.583630  
                    max     40.898110    40.866460     40.886710  
                    mean    40.723633    40.710982     40.736699  
                    std      0.050108     0.050251      0.047313  
Longitude           min    -74.169660   -73.900020    -74.097300  
                    max    -73.746270   -73.712990    -73.770690  
                    mean   -73.953674   -73.826488    -73.975251  
                    std      0.039081     0.044890      0.040526  
Price               min     10.000000    10.000000   2350.000000  
                    max   1050.000000  1500.000000  10000.000000  
                    mean   128.295350    95.379493   4435.075949  
                    std     83.072854    90.647730   2221.049843  
Number of Reviews   min     81.000000     0.000000      0.000000  
                    max    629.000000   212.000000     69.000000  
                    mean   157.777333    24.431642      2.886076  
                    std     66.336899    30.835174      8.522951  
Host Listings Count min      1.000000     1.000000      1.000000  
                    max     28.000000   103.000000     12.000000  
                    mean     1.977210     3.521494      2.721519  
                    std      1.894012     8.328940      3.569589  
                         0          1          2          3          4  \
Room Type                                                                
Entire home/apt  41.126529  98.451015  98.568873   0.510431  53.711118   
Private room     55.030126   0.808412   1.431127  96.279035  44.656606   
Shared room       3.843345   0.740573   0.000000   3.210534   1.632276   

                         5          6  
Room Type                              
Entire home/apt  37.279774  82.278481  
Private room     58.527132  17.721519  
Shared room       4.193094   0.000000  
                             0          1          2          3          4  \
neighbourhood_group                                                          
Bronx                 4.272412   1.582905   2.146691   1.494304   1.385895   
Brooklyn             20.887347  44.287410  20.572451  54.616067  47.859563   
Manhattan            60.096768  45.372831  65.116279  33.274153  39.944564   
Queens               14.186599   7.908870  11.806798   9.705578   9.947644   
Staten Island         0.547745   0.831025   0.357782   0.865513   0.862334   

                             5          6  
neighbourhood_group                        
Bronx                 2.924595   0.000000  
Brooklyn             31.465821  30.379747  
Manhattan            29.668781  55.696203  
Queens               35.271318  12.658228  
Staten Island         0.634249   1.265823  
In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import norm
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
from sklearn.metrics import r2_score
In [49]:
nyc_data = pd.read_csv('/content/AB_NYC_2019.csv')
In [50]:
nyc_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     38843 non-null  object 
 13  reviews_per_month               38843 non-null  float64
 14  calculated_host_listings_count  48895 non-null  int64  
 15  availability_365                48895 non-null  int64  
dtypes: float64(3), int64(7), object(6)
memory usage: 6.0+ MB
In [51]:
nyc_data.head(10)
Out[51]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
5 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200 3 74 2019-06-22 0.59 1 129
6 5121 BlissArtsSpace! 7356 Garon Brooklyn Bedford-Stuyvesant 40.68688 -73.95596 Private room 60 45 49 2017-10-05 0.40 1 0
7 5178 Large Furnished Room Near B'way 8967 Shunichi Manhattan Hell's Kitchen 40.76489 -73.98493 Private room 79 2 430 2019-06-24 3.47 1 220
8 5203 Cozy Clean Guest Room - Family Apt 7490 MaryEllen Manhattan Upper West Side 40.80178 -73.96723 Private room 79 2 118 2017-07-21 0.99 1 0
9 5238 Cute & Cozy Lower East Side 1 bdrm 7549 Ben Manhattan Chinatown 40.71344 -73.99037 Entire home/apt 150 1 160 2019-06-09 1.33 4 188
In [52]:
nyc_data.isnull().sum()
Out[52]:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

Convert the non numerical field to categorical

In [53]:
nyc_data['neighbourhood_group']= nyc_data['neighbourhood_group'].astype("category").cat.codes
nyc_data['neighbourhood'] = nyc_data['neighbourhood'].astype("category").cat.codes
nyc_data['room_type'] = nyc_data['room_type'].astype("category").cat.codes
nyc_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  int8   
 5   neighbourhood                   48895 non-null  int16  
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  int8   
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     38843 non-null  object 
 13  reviews_per_month               38843 non-null  float64
 14  calculated_host_listings_count  48895 non-null  int64  
 15  availability_365                48895 non-null  int64  
dtypes: float64(3), int16(1), int64(7), int8(2), object(3)
memory usage: 5.0+ MB
In [54]:
plt.figure(figsize=(10,10))
sns.distplot(nyc_data['price'], fit=norm)
plt.title("Price Distribution Plot",size=15, weight='bold')
<ipython-input-54-63eaf33967f1>:2: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


Out[54]:
Text(0.5, 1.0, 'Price Distribution Plot')

The above distribution graph shows that there is a right-skewed distribution on price. This means there is a positive skewness. We used Log transformation to make this feature less skewed. Since division by zero is a problem, log+1 transformation would be better.

In [55]:
nyc_data['price_log'] = np.log(nyc_data.price+1)
In [56]:
plt.figure(figsize=(12,10))
sns.distplot(nyc_data['price_log'], fit=norm)
plt.title("Log-Price Distribution Plot",size=15, weight='bold')
<ipython-input-56-41f083519966>:2: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


Out[56]:
Text(0.5, 1.0, 'Log-Price Distribution Plot')

The good fit indicates that normality is a reasonable approximation.

In [57]:
plt.figure(figsize=(7,7))
stats.probplot(nyc_data['price_log'], plot=plt)
plt.show()

non-nominal data and old price feature are eliminated.

In [58]:
nyc_model = nyc_data.drop(columns=['name','id' ,'host_id','host_name',
                                   'last_review','price'])
nyc_model.isnull().sum()
Out[58]:
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
price_log                             0
dtype: int64

Number of reviews has some missing data. The missing data can be replaced with mean. Since the data is more symmetric, mean replacement makes sense.

In [59]:
mean = nyc_model['reviews_per_month'].mean()
nyc_model['reviews_per_month'].fillna(mean, inplace=True)
nyc_model.isnull().sum()
Out[59]:
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
price_log                         0
dtype: int64

A correlation matrix is created with the Pearson method

In [60]:
plt.figure(figsize=(15,12))
palette = sns.diverging_palette(20, 220, n=256)
corr=nyc_model.corr(method='pearson')
sns.heatmap(corr, annot=True, fmt=".2f", cmap=palette, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}).set(ylim=(11, 0))
plt.title("Correlation Matrix",size=15, weight='bold')
Out[60]:
Text(0.5, 1.0, 'Correlation Matrix')

The correlation table shows that there is no strong relationship between price and other features. This indicates no feature needed to be taken out of data.

In [61]:
nyc_model_x, nyc_model_y = nyc_model.iloc[:,:-1], nyc_model.iloc[:,-1]
In [62]:
# removes the mean and scales each feature/variable to unit variancescaler = StandardScaler()
scaler = StandardScaler()
nyc_model_x = scaler.fit_transform(nyc_model_x)

Data is split in a 70–30 ratio

In [63]:
X_train, X_test, y_train, y_test = train_test_split(nyc_model_x, nyc_model_y, test_size=0.3,random_state=42)

Model Building¶

Build a Linear Regression, Ridge Regression, Lasso Regression, and ElasticNet Regression. The models will be used to avoiding plain Linear Regression and show the results with a little of regularization.

GridSearchCV algorithm will be used to find the best parameters and tuning hyperparameters for each model. In this algorithm 5-Fold Cross Validation and Mean Squared Error Regression Loss metrics will be used.

K-Fold Cross Validation¶

Before model building, 5-Fold Cross Validation will be implemented for validation.

In [64]:
kfold_cv=KFold(n_splits=5, random_state=42, shuffle=True)
for train_index, test_index in kfold_cv.split(nyc_model_x,nyc_model_y):
    X_train, X_test = nyc_model_x[train_index], nyc_model_x[test_index]
    y_train, y_test = nyc_model_y[train_index], nyc_model_y[test_index]

Model Prediction¶

In [65]:
# Identify the indices where y_train is not NaN
valid_indices = ~np.isnan(y_train)

# Filter out corresponding rows in X_train and y_train
X_train_clean = X_train[valid_indices]
y_train_clean = y_train[valid_indices]
X_train = X_train_clean
y_train = y_train_clean
In [66]:
##Linear Regression
lr = LinearRegression(copy_X= True, fit_intercept = True)
lr.fit(X_train, y_train)
lr_pred= lr.predict(X_test)

#Ridge Model
ridge_model = Ridge(alpha = 0.01)
ridge_model.fit(X_train, y_train)
pred_ridge = ridge_model.predict(X_test)

#Lasso Model
Lasso_model = Lasso(alpha = 0.001)
Lasso_model.fit(X_train, y_train)
pred_Lasso = Lasso_model.predict(X_test)

#ElasticNet Model
model_enet = ElasticNet(alpha = 0.01)
model_enet.fit(X_train, y_train)
pred_test_enet= model_enet.predict(X_test)

Model Prediction¶

In [67]:
###Linear Regression
lr_x = LinearRegression(copy_X= True, fit_intercept = True)
lr_x.fit(X_train, y_train)
lr_pred_x= lr_x.predict(X_test)

###Ridge
ridge_x = Ridge(alpha = 0.01)
ridge_x.fit(X_train, y_train)
pred_ridge_x = ridge_x.predict(X_test)

###Lasso
Lasso_x = Lasso(alpha = 0.001)
Lasso_x.fit(X_train, y_train)
pred_Lasso_x = Lasso_x.predict(X_test)

##ElasticNet
model_enet_x = ElasticNet(alpha = 0.01)
model_enet_x.fit(X_train, y_train)
pred_train_enet_x= model_enet_x.predict(X_train)
pred_test_enet_x= model_enet_x.predict(X_test)

Model Comparison¶

Metrics to evaluate predictions.

  • Mean Absolute Error (MAE) shows the difference between predictions and actual values.

  • Root Mean Square Error (RMSE) shows how accurately the model predicts the response.

  •               ``R^2``  will be calculated to find the goodness of fit measure.
    
In [68]:
print('-------------Lineer Regression-----------')
print('MAE: %f'% mean_absolute_error(y_test, lr_pred))
print('RMSE: %f'% np.sqrt(mean_squared_error(y_test, lr_pred)))
print('R2 %f' % r2_score(y_test, lr_pred))

print('---------------Ridge ---------------------')
print('MAE: %f'% mean_absolute_error(y_test, pred_ridge))
print('RMSE: %f'% np.sqrt(mean_squared_error(y_test, pred_ridge)))
print('R2 %f' % r2_score(y_test, pred_ridge))


print('---------------Lasso-----------------------')
print('MAE: %f' % mean_absolute_error(y_test, pred_Lasso))
print('RMSE: %f' % np.sqrt(mean_squared_error(y_test, pred_Lasso)))
print('R2 %f' % r2_score(y_test, pred_Lasso))


print('---------------ElasticNet-------------------')
print('MAE: %f' % mean_absolute_error(y_test,pred_test_enet)) #RMSE
print('RMSE: %f' % np.sqrt(mean_squared_error(y_test,pred_test_enet))) #RMSE
print('R2 %f' % r2_score(y_test, pred_test_enet))
-------------Lineer Regression-----------
MAE: 0.370140
RMSE: 0.523551
R2 0.446063
---------------Ridge ---------------------
MAE: 0.370140
RMSE: 0.523551
R2 0.446063
---------------Lasso-----------------------
MAE: 0.370153
RMSE: 0.523620
R2 0.445918
---------------ElasticNet-------------------
MAE: 0.370481
RMSE: 0.524137
R2 0.444822
  • The MAE value of 0 indicates no error on the model. In other words, there is a perfect prediction. The above results show that all predictions have some error
  • RMSE gives an idea of how much error the system typically makes in its predictions. The above results show that all models have some errors.
  • R2 represents the proportion of the variance for a dependent variable that's explained by an independent variable. The above results show that 44% of data fit the regression model
In [69]:
# Create a single row with 4 columns for the subplots
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(30, 10))  # Adjusted to 1 row, 4 columns
fig.suptitle('True Values vs Predictions')

# Plot each model's predictions
ax1.scatter(y_test, lr_pred)
ax1.set_title('Linear Regression - Phase-1')

ax2.scatter(y_test, pred_ridge)
ax2.set_title('Ridge - Phase-1')

ax3.scatter(y_test, pred_Lasso)
ax3.set_title('Lasso - Phase-1')

ax4.scatter(y_test, pred_test_enet)
ax4.set_title('ElasticNet - Phase-1')

# Set labels for each axis in all subplots
for ax in [ax1, ax2, ax3, ax4]:
    ax.set(xlabel='True Values', ylabel='Predictions')

# Display the plot
plt.show()
In [69]: